********************************************************************************

* Filename: Balance_Tables.do

********************************************************************************
*  This do-file is part of the collection of replication files for Schaffner, Glewwe, and Sharma, 
*  "Why Programs Fail:  Lessons for Improving Public Service Quality from a Mixed Methods Evaluation
*   of an Unsuccessful Teacher Training Program in Nepal"
*
*This do-file calculates:
*      - all results in Online Appendix Tables S1 and S2
*    
* Software version used: STATA/SE 18.0
********************************************************************************
*GLOBAL FILE PATH DEFINITIONS
*  Set PBRfolder to the main replication folder. Sub-folder paths are built
*  with forward slashes, which Stata accepts on Windows, macOS and Linux alike
*  (a backslash separator only works on Windows, and a backslash placed directly
*  before a macro reference would suppress the macro expansion).
	global PBRfolder "ADD FILE REFERENCE TO MAIN FOLDER HERE" 
	global datasets "$PBRfolder/Datasets"
	global logs "$PBRfolder/Logs"
	
********************************************************************************
* SET-UP
*  Clears memory, closes any log left open by an earlier run, opens the log for
*  this do-file, and makes the datasets folder the working directory so that
*  datasets can be referenced below by bare filename.
	set more off
	clear all
	set varabbrev on 
	capture log close
	log using "$logs/Balance_Tables", replace
	cd "$datasets"
	
********************************************************************************

*Create temporary files for all the baseline datasets

********************************************************************************

	* Each baseline dataset is loaded, sorted on its identifier variable(s), and
	* written to a tempfile that the later sections read back in.
	* The macro reference passed to -save- must be exactly the name declared by
	* -tempfile- (no ".dta" inside the macro quotes); otherwise the tempfile is
	* never written and the later -use-/-append- of that tempfile cannot find it.

	* Head teacher questionnaire: school-level records
	use HTschoolbaseline, clear
	sort schoolid
	tempfile tempHTschool
	save "`tempHTschool'", replace

	* Head teacher questionnaire: teacher-level records
	use HTteacherbaseline, clear
	sort schoolid T_SERNO
	tempfile tempHTteacher
	save "`tempHTteacher'", replace

	* Teacher questionnaire
	use TeacherBaseline.dta, clear
	sort schoolid T_SERNO
	tempfile tempTeacher
	save "`tempTeacher'", replace

	* Grade 8 student baseline
	use Grade08baseline, clear
	sort schoolid serial
	tempfile tempgrade8
	save "`tempgrade8'", replace

	* Grade 9 student baseline
	use Grade09baseline, clear
	sort schoolid serial
	tempfile tempgrade9
	save "`tempgrade9'", replace
	
********************************************************************************

*   Balance checks for school characteristics from baseline head teacher questionnaire

********************************************************************************
	
	use "`tempHTschool'"
	
	* Bring in the variables held in basicdata (one record per school)
		merge 1:1 schoolid using basicdata
		drop _merge
		//0 unmatched
	
	* Survey design: schools are the sampling units, weighted by sch_wght, with
	* strata identified by the district/stratum combination (district*10+stratum)
		generate dist_stratum = district*10 + stratum
		svyset schoolid [pweight=sch_wght], strata(dist_stratum)
	
	* Variables for the balance checks (some are prepared but not used in the table)
		* (1) School size: the existing size variable is replaced by total_all
			drop size
			generate size = total_all
			label variable size "Total number of students in school"
		* (2) Walking time to the nearest all-weather road (hourstoroad)
			label variable hourstoroad "No. of hours walk to the nearest all-weather road"
		* (3), (4) Students per section in grades 9 and 10 (stupersec9 stupersec10)
			label variable stupersec9 "Students per section in Grade 9"
			label variable stupersec10 "Students per section in Grade 10"
		* (5) Days the school was open last year, grade 9 (dayspyear09)
			label variable dayspyear09 "No. of days school was open last year (Grade 9)"	
		* (6) Electricity at least several hours on most days (electricity)
			label variable electricity "Whether school has electricity at least several hours most days"
		* (7) Head teacher has at least a masters degree (HTtopdegree codes 3 or 4)
		*     NOTE(review): a missing HTtopdegree is coded 0 here, not missing -
		*     confirm that this is intended.
			tabulate HTtopdegree, missing
			label list C13
			generate byte ht_mastersdegree = inlist(HTtopdegree, 3, 4)
			label variable ht_mastersdegree "Whether head teacher has masters degree"
		* (8) Weekly teaching hours of the head teacher: own periods plus substitute
		*     periods, converted to hours with the grade 9 minutes per period
			summarize HT_n_periods 
			replace HTsub_periods = 0 if HTsubs==0
			summarize HTsub_periods
			generate HTteach_per = HT_n_periods + HTsub_periods
			summarize minpperiod09
			generate HTteach_min = HTteach_per*minpperiod09
			generate HTteach_hr = HTteach_min/60
			* Same calculation with the grade 10 minutes per period, for comparison only
			generate HTteach_min_g10 = HTteach_per*minpperiod10
			generate HTteach_hr_g10 = HTteach_min_g10/60
			summarize HTteach_hr HTteach_hr_g10 //statistic is nearly the same whether we use grade 9 or grade 10 hours. Use grade 9 hours for now.
			label variable HTteach_hr "No. of hours per week head teacher teaches"
		* (9) School management quality index (definition: see appendix to preanalysis plan)
			merge 1:1 schoolid using management, keepusing(theta1)
			//2 schools unmatched from master data, 2 missing values
			label variable theta1 "School management quality index"
		

		* Global lists of the school-level variables to be tested
			
			* Binary (0/1) variables
			global schoolvars_bin "electricity ht_mastersdegree"
			
			* Continuous variables
			global schoolvars_con "size hourstoroad stupersec9 stupersec10 HTteach_hr dayspyear09 theta1"

		* Treatment indicator: 1 when studyarm is 1 or 2, 0 otherwise
			generate treat = inlist(studyarm, 1, 2)
			
			save "`tempHTschool'", replace
					
		* Holders for the p-values, and for the name of the variable each p-value
		* belongs to, used later for the q-value calculation. Row j of the dataset
		* in memory stores the result of the j-th test.
		local j=0
		gen psave=.
		gen str32 variable=""
	
	* Calculations for appendix table S1, top panel (school-level characteristics)	
	*   For each variable: survey-weighted means over treat, the r(sd) entries
	*   reported by -estat sd-, and the p-value from testing the coefficient on
	*   treat in a survey regression that also includes district#stratum.
	
	* Continuous variables (linear regression)
		foreach var in $schoolvars_con {
			svy, over(treat):mean `var', cformat(%9.1f)
			estat sd 
			mat sd1=r(sd)
			di %9.1f sd1[1,1] "  " %9.1f sd1[1,2]
			svy:reg `var' treat district#stratum
			test treat
			di %9.3f r(p)
			local j = `j' + 1
			replace psave= r(p) if _n==`j'
			* Record which variable this p-value refers to
			replace variable="`var'" if _n==`j'
			}
			
	* Binary variables (probit)
		foreach var in $schoolvars_bin {
			svy, over(treat):mean `var', cformat(%9.1f)
			estat sd 
			mat sd1=r(sd)
			di %9.1f sd1[1,1] "  " %9.1f sd1[1,2]
			svy:probit `var' treat district#stratum
			test treat
			di %9.3f r(p)
			local j = `j' + 1
			replace psave= r(p) if _n==`j'
			* Record which variable this p-value refers to
			replace variable="`var'" if _n==`j'
			}
		
	* save pvalues for later qvalue calculation
	* (only the first j rows carry results; everything else is discarded)
		keep if _n<=`j'
		keep variable psave
		tempfile psave1
		save "`psave1'", replace
		list variable psave
	

********************************************************************************
	
**** Balance for grade 9 and 10 math and science teacher characteristics
	
********************************************************************************
	
		use "`tempTeacher'", clear
	
		* Bring in the variables held in basicdata (many teachers per school)
			merge m:1 schoolid using basicdata
			//2 unmatched from using dataset
		
		* Survey design: schools are the sampling units, weighted by sch_wght, with
		* strata identified by the district/stratum combination (district*10+stratum)
			generate dist_stratum = district*10 + stratum
			svyset schoolid [pweight=sch_wght], strata(dist_stratum)
		
		* Variables for the balance checks
		* NOTE(review): the rows that come only from basicdata (unmatched from
		* using) stay in memory with missing teacher variables; t_female and t_ssrp
		* below evaluate to 0 rather than missing for such rows - confirm that
		* this is intended.
			* (1) Teacher is female (T_SEX equal to 2)
				tabulate T_SEX, missing
				generate byte t_female = (T_SEX==2)
				label variable t_female "Whether teacher is female"
			* (2) At least a bachelors degree in math or science: codes 2 up to (but
			*     excluding) 5; code 5 and missing values are set to missing
				tabulate highestedmathsci, missing
				label list T10_A
				generate byte t_atleastbach = (highestedmathsci>=2 & highestedmathsci<5)
				replace t_atleastbach = . if missing(highestedmathsci) | highestedmathsci==5
				label variable t_atleastbach "Whether teacher has at least bachelors degree in math or science"		
			* (3) Teacher reports SSRP training in math or in science
				tabulate tr_SSRPmath, missing
				tabulate tr_SSRPscience, missing
				generate byte t_ssrp = (tr_SSRPmath==1 | tr_SSRPscience==1)
				label variable t_ssrp "Whether teacher reported receiving SSRP math or science training"
			* (4) Years of teaching experience (experience)
				label variable experience "No. of years of experience as a teacher"
			* (5) Daily hours of class preparation (hrs_prep)
				label variable hrs_prep "No. of hours per day teacher prepares for class"

		* Global lists of the teacher-level variables to be tested
			* Binary (0/1) variables
			global teachervars_bin "t_female t_atleastbach t_ssrp"
			* Continuous variables
			global teachervars_con "experience hrs_prep"
			
		* Treatment indicator: 1 when studyarm is 1 or 2, 0 otherwise
			generate treat = inlist(studyarm, 1, 2)
			
			save "`tempTeacher'", replace
					
		* Holders for the p-values, and for the name of the variable each p-value
		* belongs to, used later for the q-value calculation. Row j of the dataset
		* in memory stores the result of the j-th test.
			gen psave=.
			local j=0
			gen str32 variable=""
			
		* Calculations for appendix table S1, bottom panel (teacher-level characteristics)	
		*   For each variable: survey-weighted means over treat, the r(sd) entries
		*   reported by -estat sd-, and the p-value from testing the coefficient on
		*   treat in a survey regression that also includes district#stratum.
	
			* Continuous variables (linear regression)
			foreach var in $teachervars_con {
				svy, over(treat):mean `var', cformat(%9.1f)
				estat sd 
				mat sd1=r(sd)
				di %9.1f sd1[1,1] "  " %9.1f sd1[1,2]
				svy:reg `var' treat district#stratum
				test treat
				di %9.3f r(p)
				local j = `j' + 1
				replace psave= r(p) if _n==`j'
				* Record which variable this p-value refers to
				replace variable="`var'" if _n==`j'
				}
			
			* Binary variables (probit)
			foreach var in $teachervars_bin {
				svy, over(treat):mean `var', cformat(%9.1f)
				estat sd 
				mat sd1=r(sd)
				di %9.1f sd1[1,1] "  " %9.1f sd1[1,2]
				svy:probit `var' treat district#stratum
				test treat
				di %9.3f r(p)
				local j = `j' + 1
				replace psave= r(p) if _n==`j'
				* Record which variable this p-value refers to
				replace variable="`var'" if _n==`j'
				}
					
		*save p-values (only the first j rows carry results)
			keep if _n<=`j'
			keep variable psave
			list
			tempfile psave2
			save "`psave2'", replace
	
		
********************************************************************************

****Balance for grade 8 and 9 student characteristics

********************************************************************************

		use "`tempgrade8'", clear
		append using "`tempgrade9'"
		
		* Bring in the variables held in basicdata (many students per school)
			merge m:1 schoolid using basicdata
			//0 unmatched
		
		* Survey design: schools are the sampling units, weighted by sch_wght, with
		* strata identified by the district/stratum combination (district*10+stratum)
			generate dist_stratum = district*10 + stratum
			svyset schoolid [pweight=sch_wght], strata(dist_stratum)
	
		* Variables for the balance checks
		* NOTE(review): female, f_sec and m_sec evaluate to 0 (not missing) when the
		* underlying response is missing, except where recoded explicitly below -
		* confirm that this is intended.
			* (1) Student is female (gender equal to 2)
				tabulate gender, missing
				label list C8_Q1
				generate byte female = (gender==2)
				label variable female "Whether student reports being female"
			* (2) Father is literate (f_can_read)
				label variable f_can_read "Whether student's father can read and write"
			* (3) Father has at least secondary education: levels 3 up to (but
			*     excluding) 6; codes 6 and 9 become missing; never schooled becomes 0
				generate byte f_sec = (f_educlevel>=3 & f_educlevel<6)
				replace f_sec = . if inlist(f_educlevel, 6, 9)
				replace f_sec = 0 if f_everschooled==0
				label variable f_sec "Whether student's father has at least secondary education"
			* (4) Mother is literate (m_can_read)
				label variable m_can_read "Whether student's mother can read and write"
			* (5) Mother has at least secondary education (same coding as for the father)
				generate byte m_sec = (m_educlevel>=3 & m_educlevel<6)
				replace m_sec = . if inlist(m_educlevel, 6, 9)
				replace m_sec = 0 if m_everschooled==0
				label variable m_sec "Whether student's mother has at least secondary education"
			* (6) Nepalese is the main language spoken at home (nepali_spokenhome)
				label variable nepali_spokenhome "Whether Nepalese is the main language spoken at home"
			* (7) Asset index (construction described in a footnote to the table)
			*     Reliability is examined first for every family_* item, then for
			*     the five items that remain once family_phone is left out; the index
			*     is the latent trait predicted from a 2PL IRT model on those five
			*     items, and rawasset is their simple sum.
				summarize family_phone family_tv family_bicycle family_scooter family_refrigerator family_computer
				alpha family_*, asis item  /*family_phone is not well correlated, try dropping */
				local assetitems family_tv family_bicycle family_scooter family_refrigerator family_computer
				alpha `assetitems', asis item
				/* This is not a very coherent set of items. */
				pca `assetitems'
				*screeplot 
				irt 2pl `assetitems'
				predict assetindex, latent
				generate rawasset = family_tv + family_bicycle + family_scooter + family_refrigerator + family_computer
				correlate assetindex rawasset
				label variable assetindex "Asset index of student's family (using IRT)"
				label variable rawasset "Asset index of student's family"

			* Global lists of the student-level variables to be tested
				* Binary (0/1) variables
				global studentvars_bin "female f_can_read f_sec m_can_read m_sec nepali_spokenhome"
				* Continuous variables
				global studentvars_con assetindex
				
			* Treatment indicator: 1 when studyarm is 1 or 2, 0 otherwise
				generate treat = inlist(studyarm, 1, 2)
				tempfile tempstudent				
				save "`tempstudent'", replace
					
			* Holders for the p-values, and for the name of the variable each p-value
			* belongs to, used later for the q-value calculation. Row j of the
			* dataset in memory stores the result of the j-th test.
				gen psave=.
				local j=0
				gen str32 variable=""
			
		* Calculations for appendix table S2 (first 7 rows)
		*   For each variable: survey-weighted means over treat, the r(sd) entries
		*   reported by -estat sd-, and the p-value from testing the coefficient on
		*   treat in a survey regression that also includes district#stratum.
	
			* Continuous variables (linear regression)
			foreach var in $studentvars_con {
				svy, over(treat):mean `var', cformat(%9.1f)
				estat sd 
				mat sd1=r(sd)
				di %9.1f sd1[1,1] "  " %9.1f sd1[1,2]
				svy:reg `var' treat district#stratum
				test treat
				di %9.3f r(p)
				local j = `j' + 1
				replace psave= r(p) if _n==`j'
				* Record which variable this p-value refers to
				replace variable="`var'" if _n==`j'
				}
			
			* Binary variables (probit)
			foreach var in $studentvars_bin {
				svy, over(treat):mean `var', cformat(%9.1f)
				estat sd 
				mat sd1=r(sd)
				di %9.1f sd1[1,1] "  " %9.1f sd1[1,2]
				svy:probit `var' treat district#stratum
				test treat
				di %9.3f r(p)
				local j = `j' + 1
				replace psave= r(p) if _n==`j'
				* Record which variable this p-value refers to
				replace variable="`var'" if _n==`j'
				}
	
			*save p-values (only the first j rows carry results)
				keep if _n<=`j'
				keep variable psave
				list
				tempfile psave3
				save "`psave3'", replace
	
	
********************************************************************************

**** Balance for grade 8 baseline test scores
	
********************************************************************************
	
		* Builds the grade 8 baseline test score variables (raw percentage scores and
		* IRT latent scores for math and science) and saves them back to the grade 8
		* tempfile. Statement order matters: each raw total is computed BEFORE the
		* corresponding items are multiplied by 2 for the IRT model.
		use "`tempgrade8'", clear
		
		*Merge with basicdata
		merge m:1 schoolid using basicdata
		//0 unmatched
				
		*Define survey data structure
			*Create new strata variable = district*10+stratum
			gen dist_stratum = (district*10)+stratum
			*Describe the survey data
			svyset schoolid [pweight=sch_wght], strata(dist_stratum)
		
		*Generate any new variables relevant for balance checks
			*1. Grade 8 math raw score
			*   Row total of the math items (rowtotal treats missing items as 0),
			*   expressed as a percentage of 45 (presumably the maximum attainable
			*   score - confirm).
				egen math8raw=rowtotal(ma1-ma20 mb1-mc8) 
				summ math8raw, detail
				gen math8raw_percent=(math8raw/45)*100
				summ math8raw_percent, detail
				la var math8raw_percent "Grade 8 math test score (percentage score)"

			*2. Grade 8 math IRT
			*   The flagged items are multiplied by 2 before estimation (presumably so
			*   that half-point scores become integer categories for the graded
			*   response model - confirm). The hybrid model fits 2PL to ma1-ma20 and a
			*   graded response model to mb1-mc8, with standard errors clustered by
			*   school; math8 is the predicted latent trait.
				tab1 mb1 mb2 mb3 mb4 mb5 mc6 mc7 mc8 /* Problems with mb2, mb3, mb4 & mc7 */
				foreach var in mb2 mb3 mb4 mc7 {
					replace `var'=2*`var'
					}
				alpha ma1-ma20 mb1-mc8, asis item
				*pca ma1-ma20 mb1-mc8
				*screeplot
				irt hybrid (2pl ma1-ma20) (grm mb1-mc8), vce(cluster schoolid)
				irtgraph tif
				predict math8, latent
				summ math8, detail
				la var math8 "Grade 8 math test score (IRT latent variable)"
				
			*3. Grade 8 science raw score
			*   sc1 and sc2 are sums of their sub-items (missing if any sub-item is
			*   missing). The raw total is expressed as a percentage of 50 (presumably
			*   the maximum attainable score - confirm).
				gen sc1=sc1a+sc1b+sc1c
				gen sc2=sc2a+sc2b+sc2c
				egen sci8raw=rowtotal(sa1-sa20 sb1-sb5 sc1 sc2)
				summ sci8raw, detail
				gen sci8raw_percent=(sci8raw/50)*100
				summ sci8raw_percent, detail
				la var sci8raw_percent "Grade 8 science test score (percentage score)"

			*4.  Grade 8 science IRT
			*   Same approach as for math: flagged items are multiplied by 2, then a
			*   hybrid 2PL / graded response model is fitted with school-clustered
			*   standard errors; sci8 is the predicted latent trait.
				tab1 sb1 sb2 sb3 sb4 sb5 sc1 sc2 /* Problems with all except sb4 */
				foreach var in sb1 sb2 sb3 sb5 sc1 sc2 {
					replace `var'=2*`var'
					}
				alpha sa1-sa20 sb1 sb2 sb3 sb4 sb5 sc1 sc2, asis item
				*pca sa1-sa20 sb1-sb5 sc1 sc2
				*screeplot
				irt hybrid (2pl sa1-sa20) (grm sb1 sb2 sb3 sb4 sb5 sc1 sc2), vce(cluster schoolid)
				irtgraph tif
				predict sci8, latent
				la var sci8 "Grade 8 science test score (IRT latent variable)"

		* Overwrites the grade 8 tempfile with the data now in memory (including the
		* rescaled items, the new score variables and _merge)
		save "`tempgrade8'", replace
		
		*Create a global list of grade 8 student level variables
			global scoreg8vars "math8raw_percent math8 sci8raw_percent sci8"
		
		*Define assignment to treatment variable (1 when studyarm is 1 or 2)
				gen treat = studyarm==1 | studyarm==2
				tempfile tempstudent				
				save "`tempstudent'", replace
					
		* Holders for the p-values, and for the name of the variable each p-value
		* belongs to, used later for the q-value calculation. Row j of the dataset
		* in memory stores the result of the j-th test.
				gen psave=.
				local j=0
				gen str32 variable=""
			
		* Calculations for appendix table S2 (rows 8 through 11)
		*   For each score: survey-weighted means over treat, the r(sd) entries
		*   reported by -estat sd-, and the p-value from testing the coefficient on
		*   treat in a survey regression that also includes district#stratum.
	
			* grade 8 score vars
			foreach var in $scoreg8vars {
				svy, over(treat):mean `var', cformat(%9.1f)
				estat sd 
				mat sd1=r(sd)
				di %9.1f sd1[1,1] "  " %9.1f sd1[1,2]
				svy:reg `var' treat district#stratum
				test treat
				di %9.3f r(p)
				local j = `j' + 1
				replace psave= r(p) if _n==`j'
				* Record which variable this p-value refers to
				replace variable="`var'" if _n==`j'
				}
			
			*save p-values (only the first j rows carry results)
				keep if _n<=`j'
				keep variable psave
				tempfile psave4
				save "`psave4'", replace
	
		
********************************************************************************
		
**** Balance for grade 9 baseline test scores
		
********************************************************************************
		
		* Builds the grade 9 baseline test score variables (raw percentage scores and
		* IRT latent scores for math and science) and saves them back to the grade 9
		* tempfile. Statement order matters: each raw total is computed BEFORE the
		* corresponding items are multiplied by 2 for the IRT model.
		use "`tempgrade9'", clear
		
		*Merge with basicdata
			merge m:1 schoolid using basicdata
			//0 unmatched
			
		*Define survey data structure
			*Create new strata variable = district*10+stratum
			gen dist_stratum = (district*10)+stratum
			*Describe the survey data
			svyset schoolid [pweight=sch_wght], strata(dist_stratum)
		
		*Generate any new variables relevant for balance checks	
		*   Multi-part science items are summed from their sub-items (missing if any
		*   sub-item is missing). m_items and s_items are global macros holding the
		*   math and science item lists (-macro define- creates a global macro).
			gen sb4=sb4a+sb4b
			gen sb5=sb5a+sb5b
			gen sc1=sc1a+sc1b+sc1c
			gen sc2=sc2a+sc2b
			macro define m_items "ma1-ma9 ma10-ma20 mb1-mb5 mc6-mc8"
			macro define s_items "sa1-sa20 sb1 sb2 sb3 sb4 sb5 sc1 sc2"

		*1. Grade 9 math raw score
		*   Row total of the math items (rowtotal treats missing items as 0),
		*   expressed as a percentage of 45 (presumably the maximum attainable
		*   score - confirm).
			egen math9raw=rowtotal($m_items)
			summ math9raw, detail
			gen math9raw_percent=(math9raw/45)*100
			summ math9raw_percent, detail
			la var math9raw_percent "Grade 9 math test score (percentage score)"

		*2. Grade 9 math IRT
		*   mb4 is multiplied by 2 before estimation (presumably so that half-point
		*   scores become integer categories for the graded response model -
		*   confirm). The hybrid model fits 2PL to ma1-ma20 and a graded response
		*   model to mb1-mc8, with standard errors clustered by school; math9 is the
		*   predicted latent trait.
			tab1 mb1 mb2 mb3 mb4 mb5 mc6 mc7 mc8 /* All are OK except mb4 */
			foreach var in mb4 {
				replace `var'=2*`var'
				}

			alpha $m_items, asis item
			*pca $m_items
			*screeplot
			irt hybrid (2pl ma1-ma20) (grm mb1-mc8), vce(cluster schoolid)
			irtgraph tif
			predict math9, latent
			la var math9 "Grade 9 math test score (IRT latent variable)"

		*3. Grade 9 science raw scores
		*   Row total of the science items, expressed as a percentage of 50
		*   (presumably the maximum attainable score - confirm).
			egen sci9raw=rowtotal($s_items)
			summ sci9raw, detail
			gen sci9raw_percent=(sci9raw/50)*100
			summ sci9raw_percent, detail
			la var sci9raw_percent "Grade 9 science test score (percentage score)"

		*4. Grade 9 science IRT
		*   Same approach as for math: flagged items are multiplied by 2, then a
		*   hybrid 2PL / graded response model is fitted with school-clustered
		*   standard errors; sci9 is the predicted latent trait.
			tab1 sb1 sb2 sb3 sb4 sb5 sc1 sc2 /* Problems with sb1, sb3, sb4, sc1 & sc2 */
			foreach var in sb1 sb3 sb4 sc1 sc2 {
				replace `var'=2*`var'
				}

			alpha sa1-sa20 sb1 sb2 sb3 sb4 sb5 sc1 sc2, asis item
			*pca sa1-sa20 sb1-sb5 sc1 sc2
			*screeplot
			irt hybrid (2pl sa1-sa20) (grm sb1 sb2 sb3 sb4 sb5 sc1 sc2), vce(cluster schoolid)
			irtgraph tif
			predict sci9, latent
			la var sci9 "Grade 9 science test score (IRT latent variable)"

			* Overwrites the grade 9 tempfile with the data now in memory (including
			* the rescaled items, the new score variables and _merge)
			save "`tempgrade9'", replace
		
		*Create a global list of grade 9 student level variables
			global scoreg9vars "math9raw_percent math9 sci9raw_percent sci9"

		*Define assignment to treatment variable (1 when studyarm is 1 or 2)
				gen treat = studyarm==1 | studyarm==2
				tempfile tempstudent				
				save "`tempstudent'", replace
					
		* Holders for the p-values, and for the name of the variable each p-value
		* belongs to, used later for the q-value calculation. Row j of the dataset
		* in memory stores the result of the j-th test.
				gen psave=.
				local j=0
				gen str32 variable=""
			
		* Calculations for appendix table S2 (rows 12 through 15)
		*   For each score: survey-weighted means over treat, the r(sd) entries
		*   reported by -estat sd-, and the p-value from testing the coefficient on
		*   treat in a survey regression that also includes district#stratum.
	
			* grade 9 score vars
			foreach var in $scoreg9vars {
				svy, over(treat):mean `var', cformat(%9.1f)
				estat sd 
				mat sd1=r(sd)
				di %9.1f sd1[1,1] "  " %9.1f sd1[1,2]
				svy:reg `var' treat district#stratum
				test treat
				di %9.3f r(p)
				local j = `j' + 1
				replace psave= r(p) if _n==`j'
				* Record which variable this p-value refers to
				replace variable="`var'" if _n==`j'
				}
			
			*save p-values (only the first j rows carry results)
				keep if _n<=`j'
				keep variable psave
				tempfile psave5
				save "`psave5'", replace

********************************************************************************

**** Multiple hypothesis testing corrections

********************************************************************************
	
			* qqvalue is a user-written command (not shipped with Stata). Check for
			* it before loading the p-values so that a missing installation produces
			* an actionable message instead of an "unrecognized command" error.
			capture which qqvalue
			if _rc {
				display as error "qqvalue is not installed; install it with: ssc install qqvalue"
				exit 199
			}

			* Stack the p-values saved by the five balance sections and compute
			* q-values over the pooled set with the Yekutieli method
			use "`psave1'", clear
			append using "`psave2'"
			append using "`psave3'"
			append using "`psave4'"
			append using "`psave5'"
			qqvalue psave , method(yekutieli) qvalue(qval)
			list variable psave qval
			
log close